# download_ohd_issue.py
# PDF Renamer for saved article pages
# -------------------------------------------------
# Renames already-downloaded PDFs after the article titles found in saved HTML pages
#
# NOTE: this header previously described an OHD (Journal of Open Humanities Data)
# issue downloader built on requests. The code in this file performs no
# downloading; it only works on HTML and PDF files that are already on disk.
#
# ✔ Prompts for a folder that contains the saved HTML pages and their PDFs
# ✔ Pairs HTML and PDF files in order of modification time (oldest first)
# ✔ Extracts each article title from the first <h1> element of the HTML page
# ✔ Filenames are sanitized (no invalid Windows characters) and capped at 200 characters
# ✔ Name clashes are resolved by appending " 1", " 2", ... to the title
# ✔ Each rename is logged in renamed_prper_log.csv inside the same folder
#
# Requirements: BeautifulSoup4


import os
import re
import csv
from bs4 import BeautifulSoup

MAX_FILENAME_LEN = 200


def sanitize(text):
    r"""Turn an arbitrary title into a string usable as a file name stem.

    Line breaks become spaces, the characters \ / * ? : " < > | are removed,
    runs of whitespace collapse to a single space, and the result is trimmed
    and capped at MAX_FILENAME_LEN characters.

    Args:
        text: The raw title text.

    Returns:
        The cleaned string, at most MAX_FILENAME_LEN characters long and
        without leading or trailing whitespace. May be empty.
    """
    text = text.replace("\n", " ").replace("\r", " ")
    clean = re.sub(r'[\\/*?:"<>|]', "", text)
    clean = re.sub(r"\s+", " ", clean)
    # Trim once more after truncating: the cut can land right after a space,
    # which would otherwise leave trailing whitespace in the file name.
    return clean.strip()[:MAX_FILENAME_LEN].rstrip()

# --- Prompt for folder ---
folder = input("Enter folder path containing PRPER HTML and PDF files: ").strip()


def _mtime(name):
    """Return the modification time of *name* inside the chosen folder."""
    return os.path.getmtime(os.path.join(folder, name))


# List the folder once, then order each group oldest-first by modification
# time: the i-th HTML page is paired with the i-th PDF.
_entries = os.listdir(folder)
html_files = sorted(
    (f for f in _entries if f.lower().endswith(".html")), key=_mtime
)
pdf_files = sorted(
    (f for f in _entries if f.lower().endswith(".pdf")), key=_mtime
)

if len(html_files) != len(pdf_files):
    print(f"[WARNING] Number of HTMLs ({len(html_files)}) and PDFs ({len(pdf_files)}) differ.")
    print("Ensure HTMLs and PDFs were saved in correct sequence.")

# --- Rename each PDF after the <h1> title of its paired HTML page ---
log_path = os.path.join(folder, "renamed_prper_log.csv")
count = 0
with open(log_path, "w", newline="", encoding="utf-8") as log_file:
    writer = csv.writer(log_file)
    writer.writerow(["Original PDF", "New Filename", "Extracted Title"])

    # zip() stops at the shorter list, so surplus files are left untouched.
    for html_name, pdf_name in zip(html_files, pdf_files):
        html_path = os.path.join(folder, html_name)
        pdf_path = os.path.join(folder, pdf_name)

        # errors="replace" keeps one badly encoded page from aborting the
        # whole batch; valid UTF-8 pages are read exactly as before.
        with open(html_path, "r", encoding="utf-8", errors="replace") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        title_tag = soup.find("h1")
        if not title_tag:
            print(f"[SKIP] No title found in {html_name}")
            continue

        title = title_tag.get_text(strip=True)
        stem = sanitize(title)
        if not stem:
            # An <h1> that is empty (or only invalid characters) would
            # otherwise produce a file literally named ".pdf".
            print(f"[SKIP] Empty title in {html_name}")
            continue

        new_name = stem + ".pdf"
        if new_name == pdf_name:
            # Already renamed (e.g. the script is being re-run); without this
            # check the file would collide with itself and gain a " 1" suffix.
            print(f"[SKIP] {pdf_name} is already named after its title")
            continue
        new_path = os.path.join(folder, new_name)

        # Resolve clashes with existing files by appending " 1", " 2", ...
        idx = 1
        while os.path.exists(new_path):
            new_name = f"{stem} {idx}.pdf"
            new_path = os.path.join(folder, new_name)
            idx += 1

        os.rename(pdf_path, new_path)
        writer.writerow([pdf_name, new_name, title])
        count += 1
        print(f"[{count}] {pdf_name} → {new_name}")

print(f"\n✅ Done! {count} PDFs renamed.")
print(f"📄 Log file saved as: {log_path}")
